#Alexander F. Gazmararian
#afg2@princeton.edu

library(tidyverse)
library(survey)
library(tidycensus)
library(here)
library(kableExtra)

#Load custom functions
source(here("code", "fun", "fix_center.r"))

#load survey data
g <- readRDS(here("data", "fair_wide.rds"))

#load census data----
acs.vars <- tidycensus::load_variables(2018, "acs5")
#sex by age by education
SexAgeEdu <- subset(acs.vars, concept == "SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 18 YEARS AND OVER")
SexAgeEdu <- SexAgeEdu[stringr::str_count(SexAgeEdu$label, "!") == 8, ]
#household income
income <- subset(acs.vars, concept == "HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2018 INFLATION-ADJUSTED DOLLARS)")
income <- income[-1,]
#sex by age by white
RaceAgeSex <- subset(acs.vars, concept %in% c("SEX BY AGE (WHITE ALONE)", "SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)",
                                              "SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)", "SEX BY AGE (ASIAN ALONE)",
                                              "SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)", "SEX BY AGE (SOME OTHER RACE ALONE)",
                                              "SEX BY AGE (TWO OR MORE RACES)"))
RaceAgeSex <- RaceAgeSex[stringr::str_count(RaceAgeSex$label, "!")==6,]
##B01003_001==total population
#"B25100_002", "B25100_008"
getvars <- c("B02001_002", "B01003_001", SexAgeEdu$name, income$name, RaceAgeSex$name)
#download census data
acs_long <- tidycensus::get_acs("county", county = "Greene County", state = "PA", variables = getvars, year = 2018, sumfile = "acs5")
#joint distribution of sex,age,and education
acs.SexAgeEdu <- subset(acs_long, variable %in% SexAgeEdu$name)
acs.SexAgeEdu <- merge(acs.SexAgeEdu, SexAgeEdu, by.x = "variable", by.y = "name")
joint <-
  acs.SexAgeEdu %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs", "edu"), sep = "!!")
joint <- subset(joint, select = -c(concept, estimate2, total, geography, NAME, GEOID))
joint <-
  joint %>%
  dplyr::mutate(
    edu_acs = dplyr::case_when(
      edu %in% c("Less than 9th grade", "9th to 12th grade, no diploma", "High school graduate (includes equivalency)") ~ "No college",
      edu %in% c("Associate's degree", "Some college, no degree", "Bachelor's degree", "Graduate or professional degree") ~ "College"
    )
  )
joint <-
  joint %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
joint$Freq <- joint$estimate / sum(joint$estimate)
joint$estimate <- NULL
#joint distribution of race, age, and sex
totalpop <- subset(acs_long, variable == "B01003_001")
acs.RaceAgeSex <- subset(acs_long, variable %in% RaceAgeSex$name)
acs.RaceAgeSex <- merge(acs.RaceAgeSex, RaceAgeSex, by.x = "variable", by.y = "name")
race.dist <-
  acs.RaceAgeSex %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs"), sep = "!!") %>%
  dplyr::select(-c(variable, GEOID, NAME, moe, estimate2, total, geography))
race.dist$concept <- ifelse(race.dist$concept == "SEX BY AGE (WHITE ALONE)", "white", "nonwhite")
names(race.dist)[4] <- "race_acs"
##adjust age
race.dist <-
  race.dist %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 and 19 years", "20 to 24 years") ~ "18 to 24 years",
      age_acs %in% c("25 to 29 years", "30 to 34 years") ~ "25 to 34 years",
      age_acs %in% c("35 to 44 years") ~ "35 to 44 years",
      age_acs %in% c("45 to 54 years", "55 to 64 years") ~ "45 to 64 years",
      age_acs %in% c("65 to 74 years", "75 to 84 years", "85 years and over") ~ "65 years and over",
      T ~ NA_character_
    )
  )
race.dist <- subset(race.dist, !is.na(age_acs))
race.dist <-
  race.dist %>%
  dplyr::group_by(gender, age_acs, race_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
race.dist$Freq <- race.dist$estimate / sum(race.dist$estimate)
#race.dist$Freq <- race.dist$Freq * nrow(d)
race.dist$estimate <- NULL
#prepare income distribution
acs.income <- subset(acs_long, variable %in% income$name)
acs.income <- merge(acs.income, income, by.x = "variable", by.y = "name")
income.dist <-
  acs.income %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "income"), sep = "!!")
income.dist <- subset(income.dist, select = -c(concept, estimate2, total, geography, NAME, GEOID))
income.dist <-
  income.dist %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income %in% c("Less than $10,000", "$10,000 to $14,999", "$15,000 to $19,999") ~ "Less than $20,000",
      income %in% c("$20,000 to $24,999", "$25,000 to $29,999", "$30,000 to $34,999", "$35,000 to $39,999") ~ "$20,000 to $39,999",
      income %in% c("$40,000 to $44,999", "$45,000 to $49,999", "$50,000 to $59,999") ~ "$40,000 to $59,999",
      income %in% c("$60,000 to $74,999", "$75,000 to $99,999") ~ "$60,000 - $99,999",
      income %in% c("$100,000 to $124,999", "$125,000 to $149,999", "$150,000 to $199,999", "$200,000 or more") ~ "$100,000 or more",
      T ~ NA_character_
    )
  )
income.dist <-
  income.dist %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
income.dist$Freq <- income.dist$estimate / sum(income.dist$estimate)
income.dist$estimate <- NULL
#prepare survey income distribution
g <-
  g %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income %in% c("$60,000 to $79,999", "$80,000 to $99,999") ~ "$60,000 - $99,999",
      income %in% c("$100,000 to $119,999", "$120,000 to $139,999", "$140,000 to $159,999", "$160,000 or more") ~ "$100,000 or more",
      T ~ as.character(income)
    )
  )
#impute missing data with most common category
g %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::arrange(desc(n))
g$income_acs <- ifelse(is.na(g$income_acs), "$60,000 - $99,999", g$income_acs)

#prep survey data for age variable
#prep race variable
g$race_acs <- ifelse(g$race == "White/Caucasian", "white", "nonwhite")
#prep gender variable
g$gender <- ifelse(g$sex == 1, "Female", "Male")
#impute missing gender with most common gender
g$gender <- ifelse(is.na(g$gender),  "Female", g$gender)

#prep highest ed variable
g$edu_acs <- with(g, ifelse(edu %in% c("Bachelor's Degree (for example; BA, BBA, BS)",
                                       "Bachelor's Degree (for example: BA, BBA, BS)", "Doctorate (for example: PhD, EdD)",
                                       "Master's Degree (for example: MA, MS, MEng)", "Professional Degree (for example: MD, DDS, JD)"), "College", "No college"))
table(g$edu_acs)
g <-
  g %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age == "18-24" ~ "18 to 24 years",
      age == "25-34" ~ "25 to 34 years",
      age == "35-44" ~ "35 to 44 years",
      age %in% c("45-54", "55-64") ~ "45 to 64 years",
      age == "65 or older" ~ "65 years and over"
    )
  )
#check joint distribution in sample----
#gender x age x education
d <-
  g %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., joint, by = c("gender", "age_acs", "edu_acs")) %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 to 24 years", "25 to 34 years") ~ "18-34 years",
      age_acs %in% c("35 to 44 years", "45 to 64 years") ~ "35-64 years",
      age_acs == "65 years and over" ~ "$>$65 years"
    )
  ) %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(across(n:Freq, ~ sum(.x)))

d$var <- with(d, paste(gender, age_acs, edu_acs, sep = " $\\times$ "))
d <- d[, c(9, 6, 7, 8)]
d <- d[c(3:6, 1, 2, 9:12, 7,8), ]

#income
income.sample <-
  g %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., income.dist, by = "income_acs")
names(income.sample)[1] <- "var"
income.sample <- income.sample[c(5, 2, 3, 4, 1), c(1, 4, 5, 6)]
income.sample$var <- c("$<$\\$20,000", "\\$20,000-39,999", "\\$40,000-59,999", "\\$60,000-99,999", "$>$\\$100,000")
#race
prop.white <- with(g, mean(white))
prop.white.w <- with(g, mean(white * weights))
pop.white <- race.dist %>%
  dplyr::group_by(race_acs) %>%
  dplyr::summarise(Freq = sum(Freq)) %>%
  slice(2)
r <- data.frame(var = "White", prop = prop.white, prop_weight = prop.white.w, Freq = pop.white$Freq)
#bind together
sample.dist <- rbind(d, income.sample, r)
names(sample.dist) <- c("", "Sample", "Weighted", "Population")
sample.dist <- sample.dist[, c(1, 2, 4, 3)]
kableExtra::kbl(sample.dist, booktabs = TRUE, digits = 2, format = "latex", escape = FALSE, linesep = "", align = "lccc",
                caption = "Representiveness of unweighted and weighted sample \\label{tab:sample_desc}",
                position = "t") %>%
  kableExtra::kable_styling(position = "center", font = 10.25) %>%
  kableExtra::pack_rows(escape = FALSE, "Sex/Age/Education", 1, 12) %>%
  kableExtra::pack_rows("Income", 13, 17) %>%
  kableExtra::pack_rows("Race", 18, 18) %>%
  kableExtra::add_footnote(
    paste(
      "Notes:",
      "For exposition, the table collapses the 18-24 and 25-34, and 35-44 and 45-64 age bins together, respectively.",
      "Also not shown is the joint distribution of race/age/sex used to construct weights.",
      "Population data from the 2018 5-Year ACS and cover the primary study site county."
    ),
    threeparttable = TRUE,
    notation = "none"
  ) %>%
  cat(., file = here("output", "tables", "si_tab_3.1.txt"))

#Compare to Gaikwad et al.----
load(here("data", "gaikwadetal", "USData-APSR.rdata"))
glimpse(f)
ff <- subset(f, surveysample_2 == "CoalCountry")
ff <-
  ff %>%
  dplyr::mutate(
    age_tri = dplyr::case_when(
      age_survey >= 18 & age_survey <= 34 ~ "18-34",
      age_survey >= 35 & age_survey <= 54 ~ "35-54",
      age_survey >= 55 ~ "55 or older"
    )
  )
ff$college <- ifelse(ff$education_survey %in% c("College graduate", "Graduate degree"), 1, 0)

ff.df <-
  data.frame(
    "Fair" = c(mean(g$sex), prop.table(table(g$age_tri)),  mean(g$ffemploy), mean(g$fulltime), mean(g$college), mean(g$gw_binary)),
    "Online" = c(prop.table(table(ff$sex))[[2]], prop.table(table(ff$age_tri)), prop.table(table(ff$ffemploy))[[3]],
                        prop.table(table(ff$empl_simple))[[2]],
                        mean(ff$college),
                        prop.table(table(ff$cc_concerned_self_binary))[[1]])
  )
ff.df$Difference <- ff.df$Fair - ff.df$Online
rownames(ff.df) <- c("Female", "18-34", "35-54", "55 or older", "Fossil Fuel Employment", "Employed", "College Degree", "Climate Concern")
ff.df$`$t$-stat` <- with(ff.df, (Fair - Online) / sqrt((Fair *(1-Fair))/248 + (Online * (1-Online))/516))
ff.df$`$p$-value` <- with(ff.df, pt(abs(ff.df$`$t$-stat`), (248+516), lower.tail = FALSE) * 2)


kableExtra::kbl(ff.df, booktabs = TRUE, digits = 2,
                caption = "Comparison of fair and online samples of coal country \\label{tab:sample_compare}", linesep = "", position = "h",
                escape = FALSE, format = "latex") %>%
  kableExtra::kable_styling(position = "center") %>%
  kableExtra::add_header_above(c("", "Sample" = 2, "", "", "")) %>%
  kableExtra::add_footnote(
    paste(
      "Notes:",
      "Two-sided $t$-test that assumes unequal variances.",
      "Online sample from \\textcite{gaikwad2022creating}.",
      "Samples cover different populations. The fair sample includes Southwest Pennsylvania, whereas the online sample includes states outside of that region.",
      "Fair sample size is 248. Online sample size is 516.",
      "For comparison, the fair sample is unweighted (since the online sample includes no weights)."
    ),
    threeparttable = TRUE,
    notation = "none",
    escape = FALSE
  ) %>%
  cat(., file = here("output", "tables", "si_tab_3.2.txt"))

